import os
import shutil
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

from embedding import Embeddings

# Directory containing this file; used to build the path of the persisted vector store.
CUR_DIR = os.path.dirname(__file__)



def load_file(filepath, chunk_size, chunk_overlap):
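    """Load a UTF-8 text file and split it into newline-delimited chunks.

    Returns a list of langchain Documents whose metadata['source'] is `filepath`.
    """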
    loader = TextLoader(filepath, encoding='utf-8')
    documents = loader.load()
    text_splitter = CharacterTextSplitter(separator='\n', chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs


# Directory where the Chroma collection is persisted on disk.
persist_directory = os.path.join(CUR_DIR, 'data', 'vectorbase')

class VectorSearch:
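    """Chroma-backed knowledge base: upload text files and run similarity search over their chunks."""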
    def __init__(self, embedding_model_path, chunk_size=500, chunk_overlap=0) -> None:
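        """
        Args:
            embedding_model_path: local path of the embedding model (e.g. ./model/m3e) wrapped by `Embeddings`.
            chunk_size / chunk_overlap: kept for interface compatibility; splitting is configured
                per call in `doc_upload` / `doc_upload_test`, so these are currently unused here.
        """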
        self.embeddings = Embeddings(embedding_model_path)  # embedding model used to vectorize documents and queries
        print(os.path.abspath(persist_directory))  # show where the collection is persisted
        self.vectorbase = Chroma("MyKonwledge", self.embeddings, persist_directory=persist_directory)

    def doc_upload(self, file_obj, chunk_size, chunk_overlap):
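        """Move an uploaded file into data/, split it, and add the chunks to the vector store.

        Skips the insert (and returns a message) if a document with the same source path
        is already present in the collection.
        """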
        try:
            filename = os.path.split(file_obj.name)[-1]
            file_path = 'data/' + filename
            find = self.vectorbase.get(where={'source': file_path})
            if len(find['ids']) > 0:
                return "文件[ {} ]已经存在".format(filename)
            shutil.move(file_obj.name, file_path)
            docs = load_file(file_path, chunk_size, chunk_overlap)
            self.vectorbase.add_documents(docs) # 修改成添加到vectorbase
            return "插入成功"
        except Exception as e:
            return e
    def doc_upload_test(self, chunk_size=300, chunk_overlap=0):
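        """Index the local test file data/case.txt, skipping it if it was already inserted."""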
        filename = 'case.txt'
        file_path = 'data/' + filename
        find = self.vectorbase.get(where={'source': file_path})
        if len(find['ids']) > 0:
            return "File [ {} ] already exists".format(filename)
        docs = load_file(file_path, chunk_size, chunk_overlap)
        self.vectorbase.add_documents(docs)  # add the split documents to the vector store
        return "Insert succeeded"

    def doc_search(self, method, query, top_k, knn_boost):
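        """Similarity search over the vector store.

        `method` and `knn_boost` are kept for compatibility with the earlier
        Elasticsearch-based interface and are not used here; `top_k` limits the
        number of returned chunks.
        """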
        result = []
        docs = self.vectorbase.similarity_search(query, k=top_k)
        for doc in docs:
            result.append({
                'content': doc.page_content,
                'title': os.path.basename(doc.metadata['source'])  # strip the 'data/' prefix
            })
        return result


if __name__ == "__main__":
    vs = VectorSearch("./model/m3e")
    res = vs.doc_upload_test()
    print(res)
    print(vs.doc_search("精确查询", "工程伦理", 3, 0.5))  # query "工程伦理" (engineering ethics) against the knowledge base
